Question 1

A

library(ggplot2)
library(tidyverse)

rm(list=ls())

# Define x
x = seq(0,1, length.out=100)

#x = rnorm(100,0,1)
# We choose equally spaced notes over the range [0,1]
knots = seq(0,1,length.out=8) # we set this to 8 as the first and last knots will be removed, so this ensures we are left with 6

# Basis functions per Woods formulation

b1 = function(x) {
  rep(1, length(x))
}

b2 = function(x){
  x
}

R_x_z = function(x, z) {
  ((z - 0.5)^2 - 1/12) * ((x - 0.5)^2 - 1/12) / 4 - 
  ((abs(x - z) - 0.5)^4 - 0.5 * (abs(x - z) - 0.5)^2 + 7/240) / 24
}

# Construct the basis matrix

basis_matrix = cbind(b1(x), b2(x),sapply(knots[-c(1,length(knots))],function(k) R_x_z(x,k)))

basis_design_matrix = basis_matrix %>% as_tibble()
                  
colnames(basis_design_matrix) = c("b1", "b2", "b3", "b4", paste0("b", 5:(4 + length(knots))))
basis_design_matrix = basis_design_matrix %>%
                       mutate(x = x)

basis_melted <- reshape2::melt(basis_design_matrix, id.vars = "x")

ggplot(basis_melted, aes(x = x, y = value, color = variable)) + 
  geom_line(size = 1) + 
  labs(title = "Wood Cubic Spline Basis",
       x = "x", y = "Basis Function Value", color = "Basis") +
  theme_minimal()

B

set.seed(123)
n <- 100
x = rnorm(n,0,1)

y <- 5 + sin(3 * pi * (x-0.6)) + rnorm(n, sd = 0.5^2)


knots <- seq(min(x), max(x), length.out = 32) # We define length of 32 because the first and last knots will be removed, to ensure we have exactly 30 knots

# Construct penalty matrix
knot_positions <- knots[-c(1, length(knots))]
n_knots = length(knot_positions)
S = matrix(0, n_knots, n_knots)
for(i in 1:n_knots){
  for (j in 1: n_knots){
    S[i,j] = R_x_z(knot_positions[i], knot_positions[j])
  }
}

penalized_regression_spline = function(lambda, S, knot_positions, y){
  basis_matrix <- cbind(
  b1(x),
  b2(x),
  sapply(knot_positions, function(k) R_x_z(x, k))
)
  P = lambda * S
  X  = basis_matrix[, 3:ncol(basis_matrix)]
  XtX_plus_p = t(X)%*%X + P 
  XtY = t(X)%*%y
  beta_hat = solve(XtX_plus_p)%*%XtY
  fitted_values = X %*% beta_hat
  H = X %*% solve(XtX_plus_p) %*% t(X)
  se_errors = sqrt(diag(H))
  return (list(fitted_values = fitted_values , se_errors = se_errors))
}

fit_result = penalized_regression_spline(0.00000001, S, knot_positions,y) # choose a random lambda value

plot_data <- data.frame(x = x, y = y, y_hat = fit_result$fitted_values, se_errors=fit_result$se_errors)

ggplot(plot_data, aes(x = x, y = y)) + 
  geom_point(color = 'black', alpha = 0.6) +
  geom_line(aes(y = y_hat), color = 'blue', size = 1.2) +
  labs(title = "Penalized Regression Spline Fit",
       x = "X",
       y = "Y") +
  theme_minimal()

C

# (95% confidence intervals)
upper <- fit_result$fitted_values + 1.96 * fit_result$se_errors
lower <- fit_result$fitted_values - 1.96 * fit_result$se_errors


plot_data <- data.frame(x = x, y = y, y_hat = fit_result$fitted_values, lower = lower, upper = upper)

ggplot(plot_data, aes(x = x, y = y)) + 
  geom_point(color = 'black', alpha = 0.6) +
  geom_line(aes(y = y_hat), color = 'blue', size = 1.2) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.1, fill = 'blue') +
  labs(title = "Penalized Regression Spline Fit with Confidence Bands",
       x = "X",
       y = "Y") +
  theme_minimal()

D

# Compute GCV as function of lambda
compute_gcv = function(lambda){
  basis_matrix <- cbind(
  b1(x),
  b2(x),
  sapply(knot_positions, function(k) R_x_z(x, k))
)
  P = lambda * S
  X  = basis_matrix[, 3:ncol(basis_matrix)]
  XtX_plus_p = t(X)%*%X + P 
  XtY = t(X)%*%y
  beta_hat = solve(XtX_plus_p)%*%XtY
  fitted_values = X %*% beta_hat
  
  # Compute residuals
  residuals = y - fitted_values
  H = X %*% solve(XtX_plus_p)%*%t(X)
  edf = sum(diag(H))
  
  # Compute the GCV score
  n = length(y)
  gcv = sum(residuals^2)/(n*(1-edf/n)^2)
  return (gcv)
}

lambda_values <- seq(0.00000001, 1, length.out = 5000)
gcv_scores <- sapply(lambda_values, compute_gcv)

gcv_data <- data.frame(lambda = lambda_values, gcv = gcv_scores)

ggplot(gcv_data, aes(x = lambda, y = gcv)) +
  geom_line(color = 'blue', size = 1.2) +
  labs(title = "GCV as a Function of the Smoothing Parameter",
       x = "Smoothing Parameter (lambda)",
       y = "GCV Score") +
  theme_minimal()

Question 2

Tensor Product Spline

library(gamair)
library(tidyverse)
library(dplyr)

rm(list=ls())

# Basis functions per Woods formulation

b1 = function(x) {
  rep(1, length(x))
}

b2 = function(x){
  x
}

R_x_z = function(x, z) {
  ((z - 0.5)^2 - 1/12) * ((x - 0.5)^2 - 1/12) / 4 - 
  ((abs(x - z) - 0.5)^4 - 0.5 * (abs(x - z) - 0.5)^2 + 7/240) / 24
}

data("brain")

brain_data = brain%>%as_tibble()%>%
             select(X,Y, medFPQ)

# Calculate tensor product spline
tensor_product_spline = function(x,y, response,x_knot_positions, y_knot_positions, x_penalty_matrix, y_penalty_matrix, lambda) {
  
  # Scale x and y
  x_scaled = scale(x)
  y_scaled = scale(y)
  
  y_data = as.matrix(response)
  x_basis_matrix <- cbind(sapply(x_knot_positions, function(k) R_x_z(x_scaled, k)))
  y_basis_matrix <- cbind(sapply(y_knot_positions, function(k) R_x_z(y_scaled, k)))
  
  tensor_basis_matrix = compute_tensor_product(x_basis_matrix, y_basis_matrix)
  tensor_penalty_matrix = compute_tensor_product(x_penalty_matrix, y_penalty_matrix)
  
  P = lambda * tensor_penalty_matrix
  X  = tensor_basis_matrix[1:nrow(y_data),]
  XtX_plus_p = t(X)%*%X + P 
  
  XtY = t(X)%*%y_data
  beta_hat = MASS::ginv(XtX_plus_p)%*%XtY
  fitted_values = X %*% beta_hat
  H = X %*% MASS::ginv(XtX_plus_p) %*% t(X)
  se_errors = sqrt(diag(H))
  return (list(fitted_values = fitted_values , se_errors = se_errors))
}
# Tensor product helper function
compute_tensor_product = function(A, B){
  m1 = ncol(A)
  m2 = ncol(B)
  G = matrix(NA, nrow=nrow(A)*nrow(B), ncol = m1 * m2 )
  ccol <- 1
  for (j in 1:m1) {
    for (k in 1:m2) {
      G[, ccol] <- A[, j] * B[, k]
      ccol <- ccol + 1
    }
  }
  return (G)
}

x_knots <- seq(min(brain_data$X), max(brain$X), length.out = 10) 
y_knots <- seq(min(brain_data$Y), max(brain$Y), length.out = 10) 

# Construct penalty matrix
construct_penalty_matrix <- function(knots) {
  n_knots <- length(knots)
  S <- matrix(0, n_knots, n_knots)
  for (i in 1:n_knots) {
    for (j in 1:n_knots) {
      S[i, j] <- R_x_z(knots[i], knots[j])
    }
  }
  S
}
S_x = construct_penalty_matrix(x_knots)
S_y = construct_penalty_matrix(y_knots)

tensor_product_spline_result = tensor_product_spline(
  brain_data$X, 
  brain_data$Y, 
  brain_data$medFPQ,
  x_knots, 
  y_knots,
  S_x,
  S_y,
  0.00001
)


# ggplot(brain_data, aes(x = X, y = Y)) +
#   geom_point(aes(color = medFPQ), size = 2, alpha = 0.6) +
#   scale_color_viridis_c() +
#   labs(title = "Original Data",
#        x = "X",
#        y = "Y",
#        color = "Observed medFPQ") +
#   theme_minimal()

brain_data <- brain_data %>% mutate(Fitted = tensor_product_spline_result$fitted_values)


# Visualize the observed medFPQ values
ggplot(brain_data, aes(x = X, y = Y)) +
  geom_point(aes(color = medFPQ), size = 2, alpha = 0.6) +
  scale_color_viridis_c() +
  labs(title = "Observed medFPQ",
       x = "X",
       y = "Y",
       color = "Observed medFPQ") +
  theme_minimal()

ggplot(brain_data, aes(x = X, y = Y)) +
  geom_tile(aes(fill = Fitted)) +
  scale_fill_viridis_c() +
  labs(title = "Tensor Product Spline Fit",
       x = "X",
       y = "Y",
       fill = "Fitted medFPQ") +
  theme_minimal()

plot_data <- data.frame(
  X = brain_data$X,
  Y = brain_data$Y,
  Original = brain_data$medFPQ,
  Fitted = tensor_product_spline_result$fitted_values
)

plot_data_long <- tidyr::pivot_longer(plot_data, 
                                      cols = c(Original, Fitted),
                                      names_to = "Type",
                                      values_to = "Value")

plot_data_long$Type <- factor(plot_data_long$Type, levels = c("Original", "Fitted"))


# ggplot(plot_data_long, aes(x = X, y = Y, fill = Value)) +
#   geom_tile() +
#   facet_wrap(~ Type) +
#   scale_fill_viridis_c() +
#   labs(title = "Tensor Product Original vs Fitted Values",
#        x = "X coordinate",
#        y = "Y coordinate",
#        fill = "medFPQ") +
#   theme_minimal() +
#   theme(aspect.ratio = 1)

Thin Plate Spline

library(gamair)
library(tidyverse)
library(dplyr)

rm(list=ls())

data("brain")

brain_data = brain %>% as_tibble() %>%
             select(X, Y, medFPQ)

# Radial Basis function
tps_basis = function(r){
  ifelse(r > 0, r^2 * log(r), 0)
}

create_design_matrix = function(data, knots){
  n = nrow(data)
  m = nrow(knots)
  # Create matrices of coordinates
  data_mat <- as.matrix(data[, 1:2])
  knots_mat <- as.matrix(knots[, 1:2])
  
  # Compute pairwise distances
  distances <- fields::rdist(data_mat, knots_mat)
  
  B <- apply(distances, c(1,2), tps_basis)
  
  A = cbind(1, data)
  X = cbind(A, B)
  return (list(X = X, A = A, B = B))
}

# Compute Thin Plate Spline
thin_plate_spline = function(data, lambda, num_knots){
  n = nrow(data)
  knot_indices = seq(1, n, length.out = num_knots) 
  knots = data[knot_indices, 1:2]
  
  design_matrix = create_design_matrix(data[,1:2], knots)
  X = as.matrix(design_matrix$X)
  A = design_matrix$A
  B = design_matrix$B
  
  m = ncol(B)
  P = rbind(
    cbind(matrix(0, 3, 3), matrix(0, 3, m)),
    cbind(matrix(0, m, 3), B[knot_indices,])
  )
  y = as.matrix(data[, 3])
  XtX_plus_p = t(X) %*% X + (lambda * P)
  XtY = t(X) %*% y
  beta_hat = solve(XtX_plus_p) %*% XtY
  fitted_values = X %*% beta_hat
  colnames(fitted_values) = "Fitted"
  H = X %*% solve(XtX_plus_p) %*% t(X)
  se_errors = sqrt(diag(H))
  return (list(fitted_values = fitted_values, se_errors = se_errors))
}

tps_result = thin_plate_spline(brain_data, 0.5, 10)

#head(tps_result$fitted_values)

brain_data <- brain_data %>% mutate(Fitted = tps_result$fitted_values)

ggplot(brain_data, aes(x = X, y = Y)) +
  geom_tile(aes(fill = Fitted)) +
  scale_fill_viridis_c() +
  labs(title = "Thin-Plate Spline Fit",
       x = "X",
       y = "Y",
       fill = "Fitted medFPQ") +
  theme_minimal()

plot_data <- data.frame(
  X = brain_data$X,
  Y = brain_data$Y,
  Original = brain_data$medFPQ,
  Fitted = tps_result$fitted_values
)

plot_data_long <- tidyr::pivot_longer(plot_data, cols = c(Original, Fitted), names_to = "Type", values_to = "medFPQ")

plot_data_long$Type <- factor(plot_data_long$Type, levels = c("Original", "Fitted"))

p = ggplot(plot_data_long, aes(x = X, y = Y, fill = medFPQ)) +
  geom_tile() +
  facet_wrap(~ Type) +
  scale_fill_viridis_c() +
  labs(title = "Thin-Plate Spline: Original vs Fitted Values",
       x = "X", 
       y = "Y", 
       fill = "medFPQ") +
  theme_minimal() +
  theme(aspect.ratio = 1)

#print(p)

Tensor Product and Thin Plate Spline: Using mgcv function

library(gamair)
library(mgcv)
library(tidyverse)
library(ggplot2)

data(brain)
brain_data <- brain %>%
  as_tibble() %>%
  select(X, Y, medFPQ)

tp_model <- gam(medFPQ ~ te(X, Y, k = 10), data = brain_data)

tps_model <- gam(medFPQ ~ s(X, Y, bs = "tp", k = 100), data = brain_data)

summary(tp_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medFPQ ~ te(X, Y, k = 10)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.24791    0.03003   41.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value    
## te(X,Y) 90.09  91.84 9.934  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.347   Deviance explained = 38.4%
## GCV = 1.5007  Scale est. = 1.4135    n = 1567
summary(tps_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medFPQ ~ s(X, Y, bs = "tp", k = 100)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.24791    0.03029   41.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(X,Y) 86.49  96.33 8.503  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.335   Deviance explained = 37.2%
## GCV = 1.5232  Scale est. = 1.4381    n = 1567
brain_data$Fitted_tp <- predict(tp_model, newdata = brain_data)
brain_data$Fitted_tps <- predict(tps_model, newdata = brain_data)

original_plot <- ggplot(brain_data, aes(x = X, y = Y, fill = medFPQ)) +
  geom_tile() +
  scale_fill_viridis_c() +
  labs(title = "Original Data",
       x = "X coordinate",
       y = "Y coordinate",
       fill = "medFPQ") +
  theme_minimal()

original_plot

tp_plot <- ggplot(brain_data, aes(x = X, y = Y, fill = Fitted_tp)) +
  geom_tile() +
  scale_fill_viridis_c() +
  labs(title = "Tensor-Product Spline Fit using mgcv",
       x = "X coordinate",
       y = "Y coordinate",
       fill = "Fitted medFPQ") +
  theme_minimal()

tp_plot

tps_plot <- ggplot(brain_data, aes(x = X, y = Y, fill = Fitted_tps)) +
  geom_tile() +
  scale_fill_viridis_c() +
  labs(title = "Thin-Plate Spline Fit using mgcv",
       x = "X coordinate",
       y = "Y coordinate",
       fill = "Fitted medFPQ") +
  theme_minimal()

tps_plot

Question 3: GAMs and MARS

GAM and Logistic Regression

library(mgcv)
library(caret)
rm(list = ls())
pulsar_data = read.csv2("Pulsar.csv", header = TRUE, sep=",")
head(pulsar_data)
##   Mean_Integrated          SD           EK     Skewness Mean_DMSNR_Curve
## 1        140.5625 55.68378214 -0.234571412 -0.699648398      3.199832776
## 2     102.5078125 58.88243001  0.465318154 -0.515087909      1.677257525
## 3      103.015625 39.34164944  0.323328365  1.051164429      3.121237458
## 4          136.75 57.17844874 -0.068414638 -0.636238369      3.642976589
## 5      88.7265625 40.67222541  0.600866079  1.123491692      1.178929766
## 6      93.5703125 46.69811352   0.53190485  0.416721117      1.636287625
##   SD_DMSNR_Curve EK_DMSNR_Curve Skewness_DMSNR_Curve Class
## 1    19.11042633    7.975531794          74.24222492     0
## 2    14.86014572    10.57648674          127.3935796     0
## 3    21.74466875    7.735822015          63.17190911     0
## 4     20.9592803     6.89649891          53.59366067     0
## 5     11.4687196    14.26957284          252.5673058     0
## 6    14.54507425     10.6217484          131.3940043     0
# Split the data into training and test sets (80% training, 20% test)
set.seed(10032024)
sample <- sample.int(n = nrow(pulsar_data), size = floor(.8*nrow(pulsar_data)), replace = F)
train <- pulsar_data[sample, ]
test <- pulsar_data[-sample, ]

# Separate features and target variable
X_train <- train[, !names(train) %in% "Class"]
y_train <- train$Class
X_test <- test[, !names(test) %in% "Class"]
y_test <- test$Class

# #Fit Logistic Regession Model (GLM)
# glm_model = glm(Class ~., data = train, family = binomial)
# glm_pred <- predict(glm_model, newdata = dataTest, type = "response")
# glm_class <- ifelse(glm_pred > 0.5, 1, 0)
# glm_confusion <- confusionMatrix(as.factor(glm_class), as.factor(dataTest$Class))
# glm_confusion
# 
# #Fit GAM Model
# gam_model <- gam(Class ~ s(Mean_Integrated) + s(SD) + s(EK) + s(Skewness) + s(Mean_DMSNR_Curve) + s(SD_DMSNR_Curve) + s(EK_DMSNR_Curve) + s(Skewness_DMSNR_Curve), data = training_data, family = binomial)
# gam_pred <- predict(gam_model, newdata = dataTest, type = "response")
# gam_class <- ifelse(gam_pred > 0.5, 1, 0)
# gam_confusion <- confusionMatrix(as.factor(gam_class), as.factor(dataTest$Class))
# gam_confusion

MARS

library(earth)
library(caret)
library(ggplot2)
rm(list = ls())

pulsar_data <- read.csv("Pulsar.csv", header = TRUE, sep = ",")

X <- pulsar_data[, !(names(pulsar_data) %in% c("Class"))]
y <- pulsar_data$Class

# Split the data into training (80%) and testing (20%) sets
set.seed(5749)
trainIndex <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[trainIndex, ]
X_test <- X[-trainIndex, ]
y_train <- y[trainIndex]
y_test <- y[-trainIndex]

train_data <- data.frame(X_train, target = y_train)

# Fit MARS model
mars_model <- earth(target ~ ., data = train_data)

# Predict on the test data
test_data <- data.frame(X_test)
y_pred_prob <- predict(mars_model, test_data)

y_pred <- ifelse(y_pred_prob > 0.5, 1, 0)

# Calculate the confusion matrix
conf_matrix <- confusionMatrix(as.factor(y_pred), as.factor(y_test))
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3224   68
##          1   15  272
##                                           
##                Accuracy : 0.9768          
##                  95% CI : (0.9713, 0.9815)
##     No Information Rate : 0.905           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.855           
##                                           
##  Mcnemar's Test P-Value : 1.145e-08       
##                                           
##             Sensitivity : 0.9954          
##             Specificity : 0.8000          
##          Pos Pred Value : 0.9793          
##          Neg Pred Value : 0.9477          
##              Prevalence : 0.9050          
##          Detection Rate : 0.9008          
##    Detection Prevalence : 0.9198          
##       Balanced Accuracy : 0.8977          
##                                           
##        'Positive' Class : 0               
## 
# Extract performance metrics
accuracy <- conf_matrix$overall['Accuracy']
sensitivity <- conf_matrix$byClass['Sensitivity']
specificity <- conf_matrix$byClass['Specificity']
precision <- conf_matrix$byClass['Precision']
f1 <- conf_matrix$byClass['F1']

# Create a data frame to store the metrics
metrics <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity", "Precision", "F1 Score"),
  Value = c(accuracy, sensitivity, specificity, precision, f1)
)
#print(metrics)

results <- data.frame(Actual = y_test, Predicted_Prob = y_pred_prob)
#results

Question 4: Wavelets

library(fds)
library(wavelets)
library(glmnet)
library(ggplot2)
library(wavethresh)
library(tidyr)

rm(list = ls())

data(aa)
data(ao)

aa_data <- aa$y
ao_data <- ao$y

X <- rbind(aa_data, ao_data)
y <- c(rep(0, nrow(aa_data)), rep(1, nrow(ao_data)))

# Wavelet transform function
apply_wavelet <- function(data, n_coef = 256) {
  next_power_of_2 <- 2^ceiling(log2(length(data)))
  padded_data <- c(data, rep(0, next_power_of_2 - length(data)))
  
  # Compute Wavelet coefficients
  wt <- wd(padded_data, filter.number = 10, family = "DaubLeAsymm")
  all_levels <- 0:(nlevelsWT(wt) - 1)
  all_coefs <- unlist(lapply(all_levels, function(l) accessD(wt, level = l)))
  
  return(all_coefs[1:min(length(all_coefs), n_coef)])
}

# Apply wavelet transform to each row
X_wavelet <- t(apply(X, 1, apply_wavelet))

# Plot a sample of signals and their corresponding wavelet coefficients
plot_samples <- function(data, wavelet_data, title_signal, title_wavelet, sample_size = 10) {
  sampled_indices <- sample(nrow(data), sample_size)
  sampled_data <- data[sampled_indices, ]
  sampled_wavelet_data <- wavelet_data[sampled_indices, ]
  
  # Plot signals
  signal_data <- as.data.frame(t(sampled_data))
  signal_data$Index <- 1:nrow(signal_data)
  signal_data_long <- pivot_longer(signal_data, -Index, names_to = "Sample", values_to = "Amplitude")
  
  plot_signal <- ggplot(signal_data_long, aes(x = Index, y = Amplitude, color = Sample)) +
    geom_line(alpha = 0.5) +
    labs(title = title_signal, x = "Index", y = "Amplitude") +
    theme_minimal()
  
  # Plot wavelet coefficients
  coeff_data <- as.data.frame(t(sampled_wavelet_data))
  coeff_data$Index <- 1:nrow(coeff_data)
  coeff_data_long <- pivot_longer(coeff_data, -Index, names_to = "Sample", values_to = "Coefficient")
  
  plot_wavelet <- ggplot(coeff_data_long, aes(x = Index, y = Coefficient, color = Sample)) +
    geom_line(alpha = 0.5) +
    labs(title = title_wavelet, x = "Coefficient Index", y = "Coefficient Value") +
    theme_minimal()
  
  return(list(signal_plot = plot_signal, wavelet_plot = plot_wavelet))
}

# Plot for 'aa' class
aa_plots <- plot_samples(aa_data, t(apply(aa_data, 1, apply_wavelet)), 
                         "Sample of Signals from 'aa' Class", 
                         "Sample of Wavelet Coefficients for 'aa' Class")
print(aa_plots$signal_plot)

print(aa_plots$wavelet_plot)

# Plot for 'ao' class
ao_plots <- plot_samples(ao_data, t(apply(ao_data, 1, apply_wavelet)), 
                         "Sample of Signals from 'ao' Class", 
                         "Sample of Wavelet Coefficients for 'ao' Class")
print(ao_plots$signal_plot)

print(ao_plots$wavelet_plot)

# Split data
set.seed(123)
total_samples <- nrow(X_wavelet)
train_size <- floor(0.7 * total_samples)
train_indices <- sample(1:total_samples, train_size)

X_train <- X_wavelet[train_indices, ]
X_test <- X_wavelet[-train_indices, ]
y_train <- y[train_indices]
y_test <- y[-train_indices]

# Fit the model
cv_fit <- cv.glmnet(X_train, y_train, family = "binomial", alpha = 1)
best_lambda <- cv_fit$lambda.min
model <- glmnet(X_train, y_train, family = "binomial", alpha = 1, lambda = best_lambda)


# Make predictions
predictions <- predict(model, X_test, type = "response")
predicted_classes <- ifelse(predictions > 0.5, 1, 0)

# Calculate accuracy
accuracy <- mean(predicted_classes == y_test)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 1"
# Create confusion matrix
conf_matrix <- table(Predicted = predicted_classes, Actual = y_test)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(conf_matrix)
##          Actual
## Predicted  0  1
##         0 43  0
##         1  0 47
# Calculate evaluation metrics
precision <- conf_matrix[2, 2] / (conf_matrix[2, 2] + conf_matrix[2, 1])
recall <- conf_matrix[2, 2] / (conf_matrix[2, 2] + conf_matrix[1, 2])
f1_score <- 2 * precision * recall / (precision + recall)

metrics <- data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "F1 Score"),
  Value = c(accuracy, precision, recall, f1_score)
)

print(metrics)
##      Metric Value
## 1  Accuracy     1
## 2 Precision     1
## 3    Recall     1
## 4  F1 Score     1
# Get feature importances
coef_importance <- abs(coef(model)[-1])
top_features <- order(coef_importance, decreasing = TRUE)[1:10]

print("Top 10 most important wavelet coefficients:")
## [1] "Top 10 most important wavelet coefficients:"
print(top_features)
##  [1]  83  19 144  88  12  75 204 142 104  37
# Plot feature importances
ggplot(data.frame(Index = 1:length(coef_importance), Importance = coef_importance), 
       aes(x = Index, y = Importance)) +
  geom_point() +
  geom_point(data = data.frame(Index = top_features, Importance = coef_importance[top_features]), 
             aes(x = Index, y = Importance), color = "red", size = 3) +
  labs(title = "Wavelet Coefficient Importances", x = "Coefficient Index", y = "Absolute Coefficient Value") +
  theme_minimal()

# Extract and plot the non-zero coefficients
non_zero_coeffs <- coef(model)[coef(model) != 0]
non_zero_indices <- which(coef(model) != 0)

coeff_data <- data.frame(Index = non_zero_indices, Coefficient = non_zero_coeffs)

# Plot the non-zero coefficients
ggplot(coeff_data, aes(x = Index, y = Coefficient)) +
  geom_bar(stat = "identity") +
  labs(title = "Non-zero Coefficients from Penalized Logistic Regression",
       x = "Wavelet Coefficient Index",
       y = "Coefficient Value") +
  theme_minimal()

Question 5: Functional Data Analysis

library(gamair)
library(ggplot2)
library(splines)
rm(list = ls())

data(gas)

# Extract the spectra and octane ratings
nir_spectra <- matrix(as.numeric(unlist(gas$NIR)), nrow = 60, byrow = TRUE)
colnames(nir_spectra) <- colnames(gas$NIR)
nir_spectra <- t(nir_spectra)  # Transpose to have 401 rows and 60 columns
octane <- gas$octane

# Define the B-spline basis functions
basis_functions <- bs(seq(900, 1700, by = 2), df = 20, degree = 3, intercept = TRUE)

# Define function to implement basis expansion
basis_expansion <- function(data, basis) {
  expanded_data <- matrix(0, nrow = ncol(basis), ncol = ncol(data))
  for (i in 1:ncol(data)) {
    expanded_data[, i] <- t(basis) %*% data[, i]
  }
  return(expanded_data)
}

# Apply basis expansion
nir_fd_coefs <- basis_expansion(nir_spectra, basis_functions)


nir_fd_coefs_df <- as.data.frame(t(nir_fd_coefs))
colnames(nir_fd_coefs_df) <- paste0("X", 1:ncol(nir_fd_coefs_df))

model_data <- cbind(octane = octane, nir_fd_coefs_df)

# Fit the linear model
formula <- as.formula(paste("octane ~", paste(colnames(nir_fd_coefs_df), collapse = " + "), "- 1"))
lm_model <- lm(formula, data = model_data)

summary(lm_model)
## 
## Call:
## lm(formula = formula, data = model_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -21.37  13.12  54.24  81.60 130.43 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)  
## X1   -1907.9     1025.8  -1.860   0.0703 .
## X2    1149.6     1948.9   0.590   0.5586  
## X3    -656.2     2360.8  -0.278   0.7825  
## X4     949.0     1831.7   0.518   0.6072  
## X5     570.8     2122.2   0.269   0.7893  
## X6   -1459.1     1701.9  -0.857   0.3964  
## X7    -190.4     2276.4  -0.084   0.9338  
## X8    -595.2     2231.0  -0.267   0.7910  
## X9    1507.7     2110.0   0.715   0.4790  
## X10    426.0     3048.5   0.140   0.8896  
## X11   1389.5     2558.7   0.543   0.5901  
## X12  -3365.9     2972.6  -1.132   0.2643  
## X13   -132.3     2817.8  -0.047   0.9628  
## X14   -351.4     2508.1  -0.140   0.8893  
## X15   4001.0     2626.5   1.523   0.1356  
## X16  -2283.2     2597.2  -0.879   0.3846  
## X17    744.2     2049.0   0.363   0.7184  
## X18  -2820.2     2314.7  -1.218   0.2302  
## X19   2217.7     1714.6   1.293   0.2033  
## X20    889.6      913.6   0.974   0.3360  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 83.7 on 40 degrees of freedom
## Multiple R-squared:  0.3856, Adjusted R-squared:  0.07842 
## F-statistic: 1.255 on 20 and 40 DF,  p-value: 0.2637
# First principles prediction using the fitted coefficients
fitted_coefficients <- coef(lm_model)
octane_pred_manual <- as.matrix(nir_fd_coefs_df) %*% fitted_coefficients

results <- data.frame(Actual = octane, Predicted = octane_pred_manual)

# Plot actual vs predicted values
ggplot(results, aes(x = Actual, y = Predicted)) +
  geom_point(color = 'blue') +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Actual vs Predicted Octane Ratings",
       x = "Actual Octane Rating",
       y = "Predicted Octane Rating") +
  theme_minimal()

print(fitted_coefficients)
##         X1         X2         X3         X4         X5         X6         X7 
## -1907.9197  1149.6367  -656.1988   948.9994   570.8192 -1459.0755  -190.3914 
##         X8         X9        X10        X11        X12        X13        X14 
##  -595.2310  1507.6761   426.0391  1389.5247 -3365.8842  -132.2741  -351.4226 
##        X15        X16        X17        X18        X19        X20 
##  4000.9962 -2283.1757   744.1567 -2820.2456  2217.7276   889.6180
nir_spectra_df <- as.data.frame(nir_spectra)
nir_spectra_df$Wavelength <- seq(900, 1700, by = 2)

nir_spectra_long <- tidyr::pivot_longer(nir_spectra_df, cols = -Wavelength, names_to = "Sample", values_to = "log1R")

ggplot(nir_spectra_long, aes(x = Wavelength, y = log1R, group = Sample, color = Sample)) +
  geom_line(alpha = 0.5) +
  labs(title = "NIR Spectra",
       x = "Wavelength (nm)",
       y = "log(1/R)") +
  theme_minimal() +
  theme(legend.position = "none")